import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
files =[file for file in os.listdir("D:\projects\sales\Sales_Data")]
for file in files:
print(file)
all_data.csv Sales_April_2019.csv Sales_August_2019.csv Sales_December_2019.csv Sales_February_2019.csv Sales_January_2019.csv Sales_July_2019.csv Sales_June_2019.csv Sales_March_2019.csv Sales_May_2019.csv Sales_November_2019.csv Sales_October_2019.csv Sales_September_2019.csv
path = "D:\projects\sales\Sales_Data"
#blank dataframe
all_data = pd.DataFrame()
for file in files:
current_df = pd.read_csv(path+"/"+file)
all_data = pd.concat([all_data, current_df])
all_data.shape
(373700, 6)
all_data.to_csv('D:\projects\sales\Sales_Data/all_data.csv',index=False)
all_data.dtypes
Order ID object Product object Quantity Ordered object Price Each object Order Date object Purchase Address object dtype: object
all_data.head()
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 |
| 3 | 176560 | Google Phone | 1 | 600 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
| 4 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
all_data.isnull().sum()
Order ID 0 Product 0 Quantity Ordered 0 Price Each 0 Order Date 0 Purchase Address 0 Month 0 dtype: int64
all_data = all_data.dropna(how='all')
all_data.shape
(371900, 7)
'04/19/19 08:46'.split('/')[0]
'04'
def month(x):
return x.split('/')[0]
all_data['Month']=all_data['Order Date'].apply(month)
all_data.dtypes
Order ID object Product object Quantity Ordered object Price Each object Order Date object Purchase Address object Month object dtype: object
all_data['Month']=all_data['Month'].astype(int)
all_data['Month'].unique()
array([ 4, 5, 8, 9, 12, 1, 2, 3, 7, 6, 11, 10])
filter=all_data['Month']=='Order Date'
len(all_data[~filter])
371900
all_data=all_data[~filter]
all_data.shape
(371900, 7)
all_data.head()
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | |
|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 |
| 2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 |
| 3 | 176560 | Google Phone | 1 | 600 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 |
| 4 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 |
| 5 | 176561 | Wired Headphones | 1 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 |
all_data['Month']=all_data['Month'].astype(int)
all_data.dtypes
Order ID object Product object Quantity Ordered object Price Each object Order Date object Purchase Address object Month int32 dtype: object
all_data['Price Each']=all_data['Price Each'].astype(float)
all_data['Quantity Ordered']=all_data['Quantity Ordered'].astype(int)
all_data['sales']=all_data['Quantity Ordered']*all_data['Price Each']
all_data.head(5)
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | sales | |
|---|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 |
| 2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 |
| 3 | 176560 | Google Phone | 1 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 |
| 4 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 |
| 5 | 176561 | Wired Headphones | 1 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 |
all_data.groupby('Month')['sales'].sum()
Month 1 3644513.46 2 4404044.84 3 5614200.76 4 6781340.48 5 6305213.50 6 5155604.52 7 5295551.52 8 4488935.76 9 4195120.26 10 7473453.76 11 6399206.40 12 9226886.68 Name: sales, dtype: float64
months=range(1,13)
plt.bar(months,all_data.groupby('Month')['sales'].sum())
plt.xticks(months)
plt.ylabel('Sales in USD ($)')
plt.xlabel('Month number')
plt.show()
'917 1st St, Dallas, TX 75001'.split(',')[1]
' Dallas'
def city(x):
return x.split(',')[1]
all_data['city']=all_data['Purchase Address'].apply(city)
all_data.groupby('city')['city'].count()
city Atlanta 29762 Austin 19810 Boston 39868 Dallas 29640 Los Angeles 59210 New York City 49752 Portland 24930 San Francisco 89464 Seattle 29464 Name: city, dtype: int64
plt.bar(all_data.groupby('city')['city'].count().index,all_data.groupby('city')['city'].count())
plt.xticks(rotation='vertical')
plt.ylabel('received orders')
plt.xlabel('city names')
plt.show()
all_data['Order Date'][0].dtype
dtype('O')
all_data['Hour'] = pd.to_datetime(all_data['Order Date']).dt.hour
keys=[]
hour=[]
for key,hour_df in all_data.groupby('Hour'):
keys.append(key)
hour.append(len(hour_df))
plt.grid()
plt.plot(keys,hour)
[<matplotlib.lines.Line2D at 0x2738f181400>]
all_data.groupby('Product')['Quantity Ordered'].sum().plot(kind='bar')
<AxesSubplot:xlabel='Product'>
all_data.groupby('Product')['Price Each'].mean()
Product 20in Monitor 109.99 27in 4K Gaming Monitor 389.99 27in FHD Monitor 149.99 34in Ultrawide Monitor 379.99 AA Batteries (4-pack) 3.84 AAA Batteries (4-pack) 2.99 Apple Airpods Headphones 150.00 Bose SoundSport Headphones 99.99 Flatscreen TV 300.00 Google Phone 600.00 LG Dryer 600.00 LG Washing Machine 600.00 Lightning Charging Cable 14.95 Macbook Pro Laptop 1700.00 ThinkPad Laptop 999.99 USB-C Charging Cable 11.95 Vareebadd Phone 400.00 Wired Headphones 11.99 iPhone 700.00 Name: Price Each, dtype: float64
products=all_data.groupby('Product')['Quantity Ordered'].sum().index
quantity=all_data.groupby('Product')['Quantity Ordered'].sum()
prices=all_data.groupby('Product')['Price Each'].mean()
plt.figure(figsize=(40,24))
fig,ax1 = plt.subplots()
ax2=ax1.twinx()
ax1.bar(products, quantity, color='g')
ax2.plot(products, prices, 'b-')
ax1.set_xticklabels(products, rotation='vertical', size=8)
C:\Users\anujd\AppData\Local\Temp\ipykernel_21184\2263540929.py:6: UserWarning: FixedFormatter should only be used together with FixedLocator ax1.set_xticklabels(products, rotation='vertical', size=8)
[Text(0, 0, '20in Monitor'), Text(1, 0, '27in 4K Gaming Monitor'), Text(2, 0, '27in FHD Monitor'), Text(3, 0, '34in Ultrawide Monitor'), Text(4, 0, 'AA Batteries (4-pack)'), Text(5, 0, 'AAA Batteries (4-pack)'), Text(6, 0, 'Apple Airpods Headphones'), Text(7, 0, 'Bose SoundSport Headphones'), Text(8, 0, 'Flatscreen TV'), Text(9, 0, 'Google Phone'), Text(10, 0, 'LG Dryer'), Text(11, 0, 'LG Washing Machine'), Text(12, 0, 'Lightning Charging Cable'), Text(13, 0, 'Macbook Pro Laptop'), Text(14, 0, 'ThinkPad Laptop'), Text(15, 0, 'USB-C Charging Cable'), Text(16, 0, 'Vareebadd Phone'), Text(17, 0, 'Wired Headphones'), Text(18, 0, 'iPhone')]
<Figure size 4000x2400 with 0 Axes>
all_data.shape
(371900, 10)
df=all_data[all_data['Order ID'].duplicated(keep=False)]
df.head(20)
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | sales | city | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 | Dallas | 8 |
| 2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 | Boston | 22 |
| 3 | 176560 | Google Phone | 1 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles | 14 |
| 4 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles | 14 |
| 5 | 176561 | Wired Headphones | 1 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles | 9 |
| 6 | 176562 | USB-C Charging Cable | 1 | 11.95 | 04/29/19 13:03 | 381 Wilson St, San Francisco, CA 94016 | 4 | 11.95 | San Francisco | 13 |
| 7 | 176563 | Bose SoundSport Headphones | 1 | 99.99 | 04/02/19 07:46 | 668 Center St, Seattle, WA 98101 | 4 | 99.99 | Seattle | 7 |
| 8 | 176564 | USB-C Charging Cable | 1 | 11.95 | 04/12/19 10:58 | 790 Ridge St, Atlanta, GA 30301 | 4 | 11.95 | Atlanta | 10 |
| 9 | 176565 | Macbook Pro Laptop | 1 | 1700.00 | 04/24/19 10:38 | 915 Willow St, San Francisco, CA 94016 | 4 | 1700.00 | San Francisco | 10 |
| 10 | 176566 | Wired Headphones | 1 | 11.99 | 04/08/19 14:05 | 83 7th St, Boston, MA 02215 | 4 | 11.99 | Boston | 14 |
| 11 | 176567 | Google Phone | 1 | 600.00 | 04/18/19 17:18 | 444 7th St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles | 17 |
| 12 | 176568 | Lightning Charging Cable | 1 | 14.95 | 04/15/19 12:18 | 438 Elm St, Seattle, WA 98101 | 4 | 14.95 | Seattle | 12 |
| 13 | 176569 | 27in 4K Gaming Monitor | 1 | 389.99 | 04/16/19 19:23 | 657 Hill St, Dallas, TX 75001 | 4 | 389.99 | Dallas | 19 |
| 14 | 176570 | AA Batteries (4-pack) | 1 | 3.84 | 04/22/19 15:09 | 186 12th St, Dallas, TX 75001 | 4 | 3.84 | Dallas | 15 |
| 15 | 176571 | Lightning Charging Cable | 1 | 14.95 | 04/19/19 14:29 | 253 Johnson St, Atlanta, GA 30301 | 4 | 14.95 | Atlanta | 14 |
| 16 | 176572 | Apple Airpods Headphones | 1 | 150.00 | 04/04/19 20:30 | 149 Dogwood St, New York City, NY 10001 | 4 | 150.00 | New York City | 20 |
| 17 | 176573 | USB-C Charging Cable | 1 | 11.95 | 04/27/19 18:41 | 214 Chestnut St, San Francisco, CA 94016 | 4 | 11.95 | San Francisco | 18 |
| 18 | 176574 | Google Phone | 1 | 600.00 | 04/03/19 19:42 | 20 Hill St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles | 19 |
| 19 | 176574 | USB-C Charging Cable | 1 | 11.95 | 04/03/19 19:42 | 20 Hill St, Los Angeles, CA 90001 | 4 | 11.95 | Los Angeles | 19 |
| 20 | 176575 | AAA Batteries (4-pack) | 1 | 2.99 | 04/27/19 00:30 | 433 Hill St, New York City, NY 10001 | 4 | 2.99 | New York City | 0 |
#create grouped col
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
df.head()
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | sales | city | Hour | Grouped | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 | Dallas | 8 | USB-C Charging Cable,USB-C Charging Cable |
| 2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 | Boston | 22 | Bose SoundSport Headphones,Bose SoundSport Hea... |
| 3 | 176560 | Google Phone | 1 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles | 14 | Google Phone,Wired Headphones,Google Phone,Wir... |
| 4 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles | 14 | Google Phone,Wired Headphones,Google Phone,Wir... |
| 5 | 176561 | Wired Headphones | 1 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles | 9 | Wired Headphones,Wired Headphones |
df.shape
(371900, 11)
#lets drop out all duplicate Order ID
df2 = df.drop_duplicates(subset=['Order ID'])
df2['Grouped'].value_counts()[0:5].plot.pie()
<AxesSubplot:ylabel='Grouped'>
import plotly.graph_objs as go
from plotly.offline import iplot
values=df2['Grouped'].value_counts()[0:5]
labels=df['Grouped'].value_counts()[0:5].index
trace=go.Pie(labels=labels, values=values,
hoverinfo='label+percent', textinfo='value',
textfont=dict(size=25),
pull=[0, 0, 0,0.2, 0]
)
iplot([trace])